1. Over-Under Analysis

# Load the raw odds and match tables. The directory is machine-specific, so it
# is kept in one place instead of being repeated in every readRDS() call.
data_dir <- "/Users/ilkerkurtulus/Documents/cse-master/ie582/data/hw1"
file_prefix <- "df9b1196-e3cf-4cc7-9159-f236fe738215"

# Odds: one row per odd observation; `date` is parsed with anytime().
oo <- data.table(
  readRDS(file.path(data_dir, paste0(file_prefix, "_odd_details.rds")))
)
oo[, date := anytime(date)]

# Matches: drop unused columns, parse the date, keep only matches with a score.
mm <- data.table(
  readRDS(file.path(data_dir, paste0(file_prefix, "_matches.rds")))
)
mm[, c("leagueId", "type") := NULL]
mm[, date := anytime(date)]
mm <- mm[!is.na(score)]

# Split "home:away" scores into numeric goal columns.
mm[, c("home_goal", "away_goal") := tstrsplit(score, ":", fixed = TRUE)]
mm[, `:=`(
  home_goal = as.numeric(home_goal),
  away_goal = as.numeric(away_goal)
)]
mm[, total_goals := home_goal + away_goal]

# Over/under 2.5 label (1 = over, 0 = under), computed in place so the table
# never leaves data.table.
mm[, is_over := as.numeric(total_goals > 2.5)]
head(mm)
##     matchId        home            away score                date
## 1: KjF6FiA6   tottenham manchester city   0:0 2010-08-14 15:45:00
## 2: ILVbJgQm aston villa        west ham   3:0 2010-08-14 18:00:00
## 3: SGIEDVvJ      wolves      stoke city   2:1 2010-08-14 18:00:00
## 4: YwL5xFHJ      bolton          fulham   0:0 2010-08-14 18:00:00
## 5: lQJAEBPC       wigan       blackpool   0:4 2010-08-14 18:00:00
## 6: byRcHDuf  sunderland      birmingham   2:2 2010-08-14 18:00:00
##    home_goal away_goal total_goals is_over
## 1:         0         0           0       0
## 2:         3         0           3       1
## 3:         2         1           3       1
## 4:         0         0           0       0
## 5:         0         4           4       1
## 6:         2         2           4       1

Different handicap values correspond to different events with different probabilities, so they should not be considered together. That is why, for the “ah” betType, I will pick totalhandicap = 0, and for the “ou” betType, totalhandicap = 2.5.

When choosing bookmakers, we need to make sure that they provide odds for the handicap values and bet types chosen above. To check this, let's print the number of odds per bookmaker and choose:

# Number of over/under odds rows at the 2.5 line, per bookmaker.
oo[betType == "ou" & totalhandicap == "2.5", .N, by = "bookmaker"]
##            bookmaker     N
##  1:            1xBet 10187
##  2:      bet-at-home 11838
##  3:           bet365 13809
##  4:          Betclic 12011
##  5:        BetVictor 14063
##  6:           Betway 11041
##  7:             bwin 11528
##  8:           Expekt  9664
##  9:      Paddy Power 12492
## 10:           Unibet 10905
## 11:     William Hill  5494
## 12:           youwin 11280
## 13:          Betsafe 14853
## 14:          Betsson 15276
## 15:      Sportingbet 10672
## 16:           Tipico 10046
## 17:         Pinnacle 14610
## 18:            10Bet 36443
## 19:            12BET 11148
## 20:           188BET 12306
## 21:           ComeOn 11497
## 22:           SBOBET 10418
## 23:      Interwetten  7279
## 24:         888sport  7319
## 25:          Betfair  6219
## 26: Betfair Exchange 14914
##            bookmaker     N
# Number of asian-handicap odds rows at handicap 0, per bookmaker.
oo[betType == "ah" & totalhandicap == "0", .N, by = "bookmaker"]
##            bookmaker     N
##  1:            1xBet 10584
##  2:           bet365 14846
##  3:      Interwetten  7285
##  4:           Unibet  5283
##  5:        BetVictor  6863
##  6:      Paddy Power  4972
##  7:         Pinnacle  9853
##  8: Betfair Exchange 12309
##  9:            10Bet  9758
## 10:           188BET  6847
## 11:           ComeOn  5745
## 12:            12BET  5796
## 13:           SBOBET  5568
## 14:          Betsson   322
## 15:          Betsafe    44
## 16:           youwin    60
## 17:           Expekt    30

So we can select the following 5 bookmakers: 1xBet, bet365, Betfair Exchange, Pinnacle and 10Bet.

bookmakers <- c("1xBet", "bet365", "Betfair Exchange", "Pinnacle", "10Bet")

# PCA of one bookmaker's odds, coloured by the over/under 2.5 outcome.
#
# Reads the global tables `oo` (odds) and `mm` (matches, with `is_over`) and
# the global vector `bookmakers`.
#
# Args:
#   n_bm:         index into `bookmakers` selecting the bookmaker to analyse.
#   feature_cols: columns of the joined table fed to the PCA, by position or
#                 by name. Defaults to 3:9, the positions the analysis was
#                 originally written for.
#                 NOTE(review): which odds end up in positions 3:9 depends on
#                 how many betType/oddtype columns the bookmaker offers;
#                 confirm that the "ah" and "ou" columns are meant to fall
#                 inside this range.
#
# Side effects: prints the PCA summary and draws the cumulative explained
# variance curve on the current graphics device.
#
# Returns: a plotly 3D scatter of the first three principal components.
func_1a <- function(n_bm, feature_cols = 3:9) {
  if (!(is.numeric(n_bm) && length(n_bm) == 1L &&
        n_bm %in% seq_along(bookmakers))) {
    stop("n_bm must be a single index into `bookmakers`", call. = FALSE)
  }
  if (length(feature_cols) < 3L) {
    stop("feature_cols must select at least 3 columns", call. = FALSE)
  }
  bm_name <- bookmakers[n_bm]
  odds <- oo[bookmaker == bm_name]

  # One row per match, one column per betType_oddtype; repeated quotes for the
  # same match are averaged.
  wide_other <- dcast(
    odds[(betType != "ou") & (betType != "ah")],
    matchId + bookmaker ~ betType + oddtype,
    value.var = "odd", fun.aggregate = mean
  )
  # Different handicap lines are different events with different odds, so only
  # a single line is kept per bet type: ah = 0 and ou = 2.5.
  wide_ah <- dcast(
    odds[(betType == "ah") & (totalhandicap == "0")],
    matchId ~ betType + oddtype,
    value.var = "odd", fun.aggregate = mean
  )
  wide_ou <- dcast(
    odds[(betType == "ou") & (totalhandicap == "2.5")],
    matchId ~ betType + oddtype,
    value.var = "odd", fun.aggregate = mean
  )

  # Keep only matches that have every odd and a known result.
  features <- na.omit(
    wide_other[wide_ah, on = "matchId"][wide_ou, on = "matchId"]
  )
  all_df <- na.omit(features[mm[, c("matchId", "is_over")], on = "matchId"])
  if (nrow(all_df) == 0L) {
    stop("no complete matches found for bookmaker ", bm_name, call. = FALSE)
  }
  # Character labels so plotly treats the outcome as a discrete colour.
  all_df[, is_over := as.character(is_over)]

  pca <- prcomp(all_df[, feature_cols, with = FALSE],
    center = TRUE, scale. = TRUE
  )
  print(summary(pca))

  eigs <- pca$sdev^2
  cum_exp_var_ratio <- cumsum(eigs / sum(eigs))
  plot(cum_exp_var_ratio,
    type = "l", xlab = "# of Principle Components",
    ylab = "Cumulative Explained Variance"
  )
  title(paste0("Cumulative Explained Variance Ratio of PCA for ", bm_name))

  # pca$x already holds the scores of the training rows, so there is no need
  # to call predict() on the same data again.
  all_pca3d <- data.table(pca$x[, 1:3])
  all_pca3d[, is_over := all_df$is_over]

  # 3d plot with 1st, 2nd and 3rd components
  plot_ly(all_pca3d,
    x = ~PC1, y = ~PC2, z = ~PC3,
    colors = c("#132B43", "#56B1F7"), color = ~is_over,
    type = "scatter3d", mode = "markers"
  ) %>%
    layout(title = paste(
      "Transformed Data with p = 3 PCA and is_over results",
      bm_name,
      sep = " "
    ))
}
# Over/under PCA for bookmakers[1] ("1xBet").
func_1a(n_bm = 1)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.8620 1.4220 1.1528 0.31994 0.25363 0.09678
## Proportion of Variance 0.4953 0.2889 0.1898 0.01462 0.00919 0.00134
## Cumulative Proportion  0.4953 0.7842 0.9740 0.98863 0.99782 0.99916
##                            PC7
## Standard deviation     0.07657
## Proportion of Variance 0.00084
## Cumulative Proportion  1.00000

3d plot

# Over/under PCA for bookmakers[2] ("bet365").
func_1a(n_bm = 2)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.7096 1.5209 1.2876 0.21767 0.20100 0.12273
## Proportion of Variance 0.4175 0.3304 0.2369 0.00677 0.00577 0.00215
## Cumulative Proportion  0.4175 0.7480 0.9848 0.99161 0.99738 0.99953
##                            PC7
## Standard deviation     0.05739
## Proportion of Variance 0.00047
## Cumulative Proportion  1.00000

The bet365 projection is very compact. It is hard to find patterns in the data by looking at the graph.

# Over/under PCA for bookmakers[3] ("Betfair Exchange").
func_1a(n_bm = 3)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.7306 1.3078 1.2174 0.65847 0.53327 0.27811
## Proportion of Variance 0.4279 0.2443 0.2117 0.06194 0.04063 0.01105
## Cumulative Proportion  0.4279 0.6722 0.8839 0.94584 0.98647 0.99752
##                            PC7
## Standard deviation     0.13175
## Proportion of Variance 0.00248
## Cumulative Proportion  1.00000

The result for Betfair Exchange is again hard to predict. p = 4 is a good choice for this bookmaker, since the first four components explain about 95% of the variance.

# Over/under PCA for bookmakers[4] ("Pinnacle").
func_1a(n_bm = 4)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.8741 1.6091 0.8743 0.28487 0.16779 0.12440
## Proportion of Variance 0.5018 0.3699 0.1092 0.01159 0.00402 0.00221
## Cumulative Proportion  0.5018 0.8717 0.9809 0.99246 0.99648 0.99869
##                            PC7
## Standard deviation     0.09562
## Proportion of Variance 0.00131
## Cumulative Proportion  1.00000

The most interesting graph is Pinnacle's PCA visualization. It is a 3D V-shape. Along the PC3 axis, the projections of the points are very close to each other, which makes interpretation harder compared to the others. This interesting plot actually follows from the explained variance ratio: with the first 3 components we can explain 98% of the variance of the data, which makes 3 a very good choice for the number of components.

# Over/under PCA for bookmakers[5] ("10Bet").
func_1a(n_bm = 5)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     1.9234 1.5186 0.71947 0.61145 0.23300 0.17029
## Proportion of Variance 0.5285 0.3295 0.07395 0.05341 0.00776 0.00414
## Cumulative Proportion  0.5285 0.8579 0.93188 0.98529 0.99305 0.99719
##                            PC7
## Standard deviation     0.14018
## Proportion of Variance 0.00281
## Cumulative Proportion  1.00000

For 10Bet, the cumulative variance explained by the first 3 components (about 93%) is lower than for Pinnacle. The distortion of the V-shape also confirms that.

Calculation of the Manhattan and Euclidean distances, as well as their 2D and 3D representations with MDS:

bookmakers <- c("1xBet", "bet365", "Betfair Exchange", "Pinnacle", "10Bet")

# Build the per-match odds table of bookmakers[n_bm], labelled with `is_over`.
# Reads the global tables `oo` and `mm`. Returns a data.table with one row per
# match that has every odd and a known result; `is_over` is a character label.
.ou_feature_table <- function(n_bm) {
  if (!(is.numeric(n_bm) && length(n_bm) == 1L &&
        n_bm %in% seq_along(bookmakers))) {
    stop("n_bm must be a single index into `bookmakers`", call. = FALSE)
  }
  bm_name <- bookmakers[n_bm]
  odds <- oo[bookmaker == bm_name]

  # One row per match, one column per betType_oddtype; repeated quotes for the
  # same match are averaged.
  wide_other <- dcast(
    odds[(betType != "ou") & (betType != "ah")],
    matchId + bookmaker ~ betType + oddtype,
    value.var = "odd", fun.aggregate = mean
  )
  # Different handicap lines are different events with different odds, so only
  # a single line is kept per bet type: ah = 0 and ou = 2.5.
  wide_ah <- dcast(
    odds[(betType == "ah") & (totalhandicap == "0")],
    matchId ~ betType + oddtype,
    value.var = "odd", fun.aggregate = mean
  )
  wide_ou <- dcast(
    odds[(betType == "ou") & (totalhandicap == "2.5")],
    matchId ~ betType + oddtype,
    value.var = "odd", fun.aggregate = mean
  )

  features <- na.omit(
    wide_other[wide_ah, on = "matchId"][wide_ou, on = "matchId"]
  )
  all_df <- na.omit(features[mm[, c("matchId", "is_over")], on = "matchId"])
  all_df[, is_over := as.character(is_over)]
  all_df
}

# Embed the matches of one bookmaker in 3D with cmdscale() on the requested
# distance and return a plotly scatter coloured by `is_over`.
.ou_mds_plot <- function(n_bm, dist_method, title_prefix, feature_cols) {
  all_df <- .ou_feature_table(n_bm)
  d <- dist(all_df[, feature_cols, with = FALSE], method = dist_method)
  mds3 <- data.table(cmdscale(d, eig = TRUE, k = 3)$points)
  mds3[, is_over := all_df$is_over]

  # `domain` is not a scatter3d trace attribute, so it is not passed here.
  plot_ly(mds3,
    x = ~V1, y = ~V2, z = ~V3, color = ~is_over,
    type = "scatter3d", mode = "markers"
  ) %>%
    layout(title = paste(title_prefix, bookmakers[n_bm], sep = " "))
}

# 3D MDS plot of Manhattan distances between matches, coloured by over/under.
#
# Args:
#   n_bm:         index into `bookmakers` selecting the bookmaker to analyse.
#   feature_cols: columns of the joined table used for the distances, by
#                 position or by name. Defaults to 3:9.
#                 NOTE(review): confirm that positions 3:9 contain the odds
#                 columns intended for this analysis.
#
# Returns: a plotly 3D scatter.
func_1bman <- function(n_bm, feature_cols = 3:9) {
  .ou_mds_plot(
    n_bm, "manhattan", "MDS Manhattan and is_over results", feature_cols
  )
}

# 3D MDS plot of Euclidean distances between matches, coloured by over/under.
# Same arguments and return value as func_1bman().
func_1beuc <- function(n_bm, feature_cols = 3:9) {
  .ou_mds_plot(
    n_bm, "euclidean", "MDS Euclidian and is_over results", feature_cols
  )
}
# Over/under MDS plots: Euclidean, then Manhattan, for each entry of
# `bookmakers` in order.
func_1beuc(n_bm = 1)
func_1bman(n_bm = 1)
func_1beuc(n_bm = 2)
func_1bman(n_bm = 2)
func_1beuc(n_bm = 3)
func_1bman(n_bm = 3)
func_1beuc(n_bm = 4)
func_1bman(n_bm = 4)
func_1beuc(n_bm = 5)
func_1bman(n_bm = 5)

- Part C

  1. 1x2 Analysis

Feature engineering for this part:

bookmakers <- c("1xBet", "bet365", "Betfair Exchange", "Pinnacle", "10Bet")

# PCA of one bookmaker's odds, coloured by the 1x2 match result.
#
# Reads the global tables `oo` (odds) and `mm` (matches) and the global vector
# `bookmakers`.
#
# Args:
#   n_bm:         index into `bookmakers` selecting the bookmaker to analyse.
#   feature_cols: columns of the joined table fed to the PCA, by position or
#                 by name. Defaults to 3:9, the positions the analysis was
#                 originally written for.
#                 NOTE(review): which odds end up in positions 3:9 depends on
#                 how many betType/oddtype columns the bookmaker offers;
#                 confirm that the intended columns fall inside this range.
#
# Side effects: adds/overwrites the column `is_1x2` on the global `mm` by
# reference, prints the PCA summary and draws the cumulative explained
# variance curve on the current graphics device.
#
# Returns: a plotly 3D scatter of the first three principal components.
func_2a <- function(n_bm, feature_cols = 3:9) {
  if (!(is.numeric(n_bm) && length(n_bm) == 1L &&
        n_bm %in% seq_along(bookmakers))) {
    stop("n_bm must be a single index into `bookmakers`", call. = FALSE)
  }
  if (length(feature_cols) < 3L) {
    stop("feature_cols must select at least 3 columns", call. = FALSE)
  }
  bm_name <- bookmakers[n_bm]
  odds <- oo[bookmaker == bm_name]

  # One row per match, one column per betType_oddtype; repeated quotes for the
  # same match are averaged.
  wide_other <- dcast(
    odds[(betType != "ou") & (betType != "ah")],
    matchId + bookmaker ~ betType + oddtype,
    value.var = "odd", fun.aggregate = mean
  )
  # Different handicap lines are different events with different odds, so only
  # a single line is kept per bet type: ah = 0 and ou = 2.5.
  wide_ah <- dcast(
    odds[(betType == "ah") & (totalhandicap == "0")],
    matchId ~ betType + oddtype,
    value.var = "odd", fun.aggregate = mean
  )
  wide_ou <- dcast(
    odds[(betType == "ou") & (totalhandicap == "2.5")],
    matchId ~ betType + oddtype,
    value.var = "odd", fun.aggregate = mean
  )
  features <- na.omit(
    wide_other[wide_ah, on = "matchId"][wide_ou, on = "matchId"]
  )

  # Match result label: "1" home win, "x" draw, "2" away win. This writes to
  # the global `mm` by reference, as the original analysis did.
  mm[, is_1x2 := ifelse(
    home_goal > away_goal, "1",
    ifelse(home_goal == away_goal, "x", "2")
  )]
  all_1x2 <- na.omit(features[mm[, c("matchId", "is_1x2")], on = "matchId"])
  if (nrow(all_1x2) == 0L) {
    stop("no complete matches found for bookmaker ", bm_name, call. = FALSE)
  }

  pca <- prcomp(all_1x2[, feature_cols, with = FALSE],
    center = TRUE, scale. = TRUE
  )
  print(summary(pca))

  eigs <- pca$sdev^2
  cum_exp_var_ratio <- cumsum(eigs / sum(eigs))
  plot(cum_exp_var_ratio,
    type = "l", xlab = "# of Principle Components",
    ylab = "Cumulative Explained Variance"
  )
  title(paste0(
    "1x2 Results - Cumulative Explained Variance Ratio of PCA for ",
    bm_name
  ))

  # pca$x already holds the scores of the training rows, so there is no need
  # to call predict() on the same data again.
  all_pca3d <- data.table(pca$x[, 1:3])
  all_pca3d[, is_1x2 := all_1x2$is_1x2]

  # 3d plot with 1st, 2nd and 3rd components
  plot_ly(all_pca3d,
    x = ~PC1, y = ~PC2, z = ~PC3, color = ~is_1x2,
    type = "scatter3d", mode = "markers"
  ) %>%
    layout(title = paste(
      "Transformed Data with p = 3 PCA and is_1x2 results",
      bm_name,
      sep = " "
    ))
}
# 1x2 PCA for bookmakers[1] ("1xBet").
func_2a(n_bm = 1)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.8620 1.4220 1.1528 0.31994 0.25363 0.09678
## Proportion of Variance 0.4953 0.2889 0.1898 0.01462 0.00919 0.00134
## Cumulative Proportion  0.4953 0.7842 0.9740 0.98863 0.99782 0.99916
##                            PC7
## Standard deviation     0.07657
## Proportion of Variance 0.00084
## Cumulative Proportion  1.00000

# 1x2 PCA for bookmakers[2] ("bet365").
func_2a(n_bm = 2)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.7096 1.5209 1.2876 0.21767 0.20100 0.12273
## Proportion of Variance 0.4175 0.3304 0.2369 0.00677 0.00577 0.00215
## Cumulative Proportion  0.4175 0.7480 0.9848 0.99161 0.99738 0.99953
##                            PC7
## Standard deviation     0.05739
## Proportion of Variance 0.00047
## Cumulative Proportion  1.00000

# 1x2 PCA for bookmakers[3] ("Betfair Exchange").
func_2a(n_bm = 3)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.7306 1.3078 1.2174 0.65847 0.53327 0.27811
## Proportion of Variance 0.4279 0.2443 0.2117 0.06194 0.04063 0.01105
## Cumulative Proportion  0.4279 0.6722 0.8839 0.94584 0.98647 0.99752
##                            PC7
## Standard deviation     0.13175
## Proportion of Variance 0.00248
## Cumulative Proportion  1.00000

# 1x2 PCA for bookmakers[4] ("Pinnacle").
func_2a(n_bm = 4)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.8741 1.6091 0.8743 0.28487 0.16779 0.12440
## Proportion of Variance 0.5018 0.3699 0.1092 0.01159 0.00402 0.00221
## Cumulative Proportion  0.5018 0.8717 0.9809 0.99246 0.99648 0.99869
##                            PC7
## Standard deviation     0.09562
## Proportion of Variance 0.00131
## Cumulative Proportion  1.00000

# 1x2 PCA for bookmakers[5] ("10Bet").
func_2a(n_bm = 5)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     1.9234 1.5186 0.71947 0.61145 0.23300 0.17029
## Proportion of Variance 0.5285 0.3295 0.07395 0.05341 0.00776 0.00414
## Cumulative Proportion  0.5285 0.8579 0.93188 0.98529 0.99305 0.99719
##                            PC7
## Standard deviation     0.14018
## Proportion of Variance 0.00281
## Cumulative Proportion  1.00000

Calculation of the Manhattan and Euclidean distances, as well as their 2D and 3D representations with MDS:

bookmakers <- c("1xBet", "bet365", "Betfair Exchange", "Pinnacle", "10Bet")

# Build the per-match odds table of bookmakers[n_bm], labelled with `is_1x2`.
# Reads the global tables `oo` and `mm`, and adds/overwrites the column
# `is_1x2` on the global `mm` by reference. Returns a data.table with one row
# per match that has every odd and a known result.
.x12_feature_table <- function(n_bm) {
  if (!(is.numeric(n_bm) && length(n_bm) == 1L &&
        n_bm %in% seq_along(bookmakers))) {
    stop("n_bm must be a single index into `bookmakers`", call. = FALSE)
  }
  bm_name <- bookmakers[n_bm]
  odds <- oo[bookmaker == bm_name]

  # One row per match, one column per betType_oddtype; repeated quotes for the
  # same match are averaged.
  wide_other <- dcast(
    odds[(betType != "ou") & (betType != "ah")],
    matchId + bookmaker ~ betType + oddtype,
    value.var = "odd", fun.aggregate = mean
  )
  # Different handicap lines are different events with different odds, so only
  # a single line is kept per bet type: ah = 0 and ou = 2.5.
  wide_ah <- dcast(
    odds[(betType == "ah") & (totalhandicap == "0")],
    matchId ~ betType + oddtype,
    value.var = "odd", fun.aggregate = mean
  )
  wide_ou <- dcast(
    odds[(betType == "ou") & (totalhandicap == "2.5")],
    matchId ~ betType + oddtype,
    value.var = "odd", fun.aggregate = mean
  )
  features <- na.omit(
    wide_other[wide_ah, on = "matchId"][wide_ou, on = "matchId"]
  )

  # Match result label: "1" home win, "x" draw, "2" away win.
  mm[, is_1x2 := ifelse(
    home_goal > away_goal, "1",
    ifelse(home_goal == away_goal, "x", "2")
  )]
  all_1x2 <- na.omit(features[mm[, c("matchId", "is_1x2")], on = "matchId"])
  all_1x2[, is_1x2 := as.character(is_1x2)]
  all_1x2
}

# Embed the matches of one bookmaker in 3D with cmdscale() on the requested
# distance and return a plotly scatter coloured by `is_1x2`.
.x12_mds_plot <- function(n_bm, dist_method, title_prefix, feature_cols) {
  all_1x2 <- .x12_feature_table(n_bm)
  d <- dist(all_1x2[, feature_cols, with = FALSE], method = dist_method)
  mds3 <- data.table(cmdscale(d, eig = TRUE, k = 3)$points)
  mds3[, is_1x2 := all_1x2$is_1x2]

  plot_ly(mds3,
    x = ~V1, y = ~V2, z = ~V3, color = ~is_1x2,
    type = "scatter3d", mode = "markers"
  ) %>%
    layout(title = paste(title_prefix, bookmakers[n_bm], sep = " "))
}

# 3D MDS plot of Euclidean distances between matches, coloured by 1x2 result.
#
# Args:
#   n_bm:         index into `bookmakers` selecting the bookmaker to analyse.
#   feature_cols: columns of the joined table used for the distances, by
#                 position or by name. Defaults to 3:9.
#                 NOTE(review): confirm that positions 3:9 contain the odds
#                 columns intended for this analysis.
#
# Returns: a plotly 3D scatter.
func_2beuc <- function(n_bm, feature_cols = 3:9) {
  .x12_mds_plot(
    n_bm, "euclidean", "MDS Euclidian and is_1x2 results", feature_cols
  )
}

# 3D MDS plot of Manhattan distances between matches, coloured by 1x2 result.
# Same arguments and return value as func_2beuc().
func_2bman <- function(n_bm, feature_cols = 3:9) {
  .x12_mds_plot(
    n_bm, "manhattan", "MDS Manhattan and is_1x2 results", feature_cols
  )
}
# 1x2 MDS plots: Manhattan, then Euclidean, for each entry of `bookmakers`
# in order.
func_2bman(n_bm = 1)
func_2beuc(n_bm = 1)
func_2bman(n_bm = 2)
func_2beuc(n_bm = 2)
func_2bman(n_bm = 3)
func_2beuc(n_bm = 3)
func_2bman(n_bm = 4)
func_2beuc(n_bm = 4)
func_2bman(n_bm = 5)
func_2beuc(n_bm = 5)